import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
# Load the raw movie dataset, then narrow it to the columns this analysis uses.
df = pd.read_csv('./data/Movie Rating Dataset.csv')
print(f'cols before: {df.columns}')
relevant_columns = [
    'Title', 'Genre', 'Tags', 'Languages', 'Series or Movie', 'Runtime',
    'Director', 'Writer', 'Actors', 'Release Date', 'Summary', 'IMDb Score',
]
df = df[relevant_columns]
print(f'After changing columns: {df.columns}')
cols before: Index(['Title', 'Genre', 'Tags', 'Languages', 'Series or Movie',
'Hidden Gem Score', 'Country Availability', 'Runtime', 'Director',
'Writer', 'Actors', 'View Rating', 'IMDb Score',
'Rotten Tomatoes Score', 'Metacritic Score', 'Awards Received',
'Awards Nominated For', 'Boxoffice', 'Release Date',
'Netflix Release Date', 'Production House', 'Netflix Link', 'IMDb Link',
'Summary', 'IMDb Votes', 'Image', 'Poster', 'TMDb Trailer',
'Trailer Site'],
dtype='object')
After changing columns: Index(['Title', 'Genre', 'Tags', 'Languages', 'Series or Movie', 'Runtime',
'Director', 'Writer', 'Actors', 'Release Date', 'Summary',
'IMDb Score'],
dtype='object')
# Report the row count, then remove every row whose target ('IMDb Score') is missing.
print(f'How many instances: {df.shape[0]}')
# Double-quoted f-string: reusing the outer quote character inside the braces
# only parses on Python 3.12+, so the subscript keeps single quotes instead.
print(f"How many nulls in IMDb Score col: {df['IMDb Score'].isnull().sum()}")
# Rows without a score cannot be used for any of the rating analyses below.
df.dropna(subset=['IMDb Score'], inplace=True)
print(f'How many instances after dropping nulls in IMDb Score col: {df.shape[0]}')
How many instances: 15480 How many nulls in IMDb Score col: 2099 How many instances after dropping nulls in IMDb Score col: 13381
import seaborn as sns

# Histogram of the target with a KDE overlay; bar heights are probabilities.
score_fig, score_ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x="IMDb Score", stat='probability', bins=87, kde=True, ax=score_ax)
score_ax.set_title("IMDb Score's Distribution")
score_fig.savefig('./results/IMDb Score Distribution.png', facecolor='white', edgecolor='white')
plt.show()
# List every distinct IMDb score, smallest first.
distinct_scores = df['IMDb Score'].unique()
sorted_unique_scores = sorted(distinct_scores)
print(sorted_unique_scores)
[1.0, 1.4, 1.5, 1.6, 1.7, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9, 6.0, 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7, 6.8, 6.9, 7.0, 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.8, 7.9, 8.0, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.7, 8.8, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.7]
# Table of descriptive statistics for the target:
# mean, median, mode, Q1, Q3, variance and standard deviation.
imdb_scores = df['IMDb Score']
mean = round(imdb_scores.mean(), 3)
median = imdb_scores.median()
# Compute the mode once; Series.mode() may return several values (ties) or none.
score_modes = imdb_scores.mode()
mode = score_modes.values[0] if not score_modes.empty else float('nan')
q1 = imdb_scores.quantile(0.25)
q3 = imdb_scores.quantile(0.75)
# The 75th percentile used to be stored under the name `q2`; the alias is kept
# so that code relying on that name still works.
q2 = q3
variance = round(imdb_scores.var(), 3)
standard_deviation = round(imdb_scores.std(), 3)

# Prepare data for the table
descriptive_stats = pd.DataFrame({
    'Statistic': ['Mean', 'Median', 'Mode', 'Q1', 'Q3', 'Variance', 'Standard deviation'],
    'Value': [mean, median, mode, q1, q3, variance, standard_deviation],
})

# Create a Plotly table
fig = go.Figure(data=[go.Table(
    header=dict(values=list(descriptive_stats.columns),
                fill_color='#636EFA',
                align='left',
                font=dict(color='black', size=15)),
    cells=dict(values=[descriptive_stats.Statistic, descriptive_stats.Value],
               fill_color='lavender',
               align='left',
               height=25,
               font=dict(color='black', size=14)),
)])
fig.update_layout(
    template='plotly_white',
    width=500,
    title={'text': 'Descriptive Statistics - IMDB Score (Target)',
           'y': 0.85,
           'x': 0.5,
           'xanchor': 'center',
           'font': dict(color='black', weight='bold')},
)
# File name offered by the mode bar's "download plot as image" button.
config = {
    'toImageButtonOptions': {
        'filename': 'Descriptive Statistics - Rating',
    }
}
fig.show(config=config)
# A title counts as a duplicate when 'Title', 'Release Date' and 'Series or Movie' all match;
# the first occurrence is kept.
duplicate_key = ['Title', 'Release Date', 'Series or Movie']
duplicate_mask = df.duplicated(subset=duplicate_key, keep='first')
duplicates = df[duplicate_mask]
num_duplicates = len(duplicates)
print(f'Number of duplicate rows identified: {num_duplicates}')

# Remove the repeated rows and renumber the index from 0.
df.drop_duplicates(subset=duplicate_key, keep='first', inplace=True)
df.reset_index(drop=True, inplace=True)
data_len_after = len(df)
print(f'Total number of rows after dropping duplicates: {data_len_after}')
Number of duplicate rows identified: 82 Total number of rows after dropping duplicates: 13299
# Sanity check: no (Title, Release Date, Series or Movie) combination should repeat any more.
has_duplicates = df.duplicated(
    subset=['Title', 'Release Date', 'Series or Movie'],
    keep='first',
).any()
print('there are duplicates' if has_duplicates else 'there are no duplicates')
there are no duplicates
# Inspection only: a bare expression, so the column index is displayed only in an
# interactive/notebook session and nothing is assigned.
df.columns
Index(['Title', 'Genre', 'Tags', 'Languages', 'Series or Movie', 'Runtime',
'Director', 'Writer', 'Actors', 'Release Date', 'Summary',
'IMDb Score'],
dtype='object')
# Predictors - Null Values
# One record per column: its number of distinct values and its share of nulls (in percent).
total_rows = len(df)
column_info = [
    {
        'Column Name': name,
        'Unique Values': df[name].nunique(),
        'Null Percent': round((df[name].isnull().sum() / total_rows) * 100, 2),
    }
    for name in df.columns
]
column_info_df = pd.DataFrame(column_info)

# Render the summary as a Plotly table.
table_header = dict(
    values=list(column_info_df.columns),
    fill_color='#636EFA',
    align='left',
    font=dict(color='black', size=15),
)
table_cells = dict(
    values=[column_info_df[label] for label in ('Column Name', 'Unique Values', 'Null Percent')],
    fill_color='lavender',
    align='left',
    height=25,
    font=dict(color='black', size=13),
)
fig = go.Figure(data=[go.Table(header=table_header, cells=table_cells)])
table_title = {
    'text': 'Predictors - Null Values',
    'y': 0.85,
    'x': 0.5,
    'xanchor': 'center',
    'font': dict(color='black', weight='bold'),
}
fig.update_layout(template='plotly_white', width=550, height=550, title=table_title)
fig.show()
# Bar chart of the number of missing values in each column.
missing_values = df.isnull().sum()
fig, ax = plt.subplots(figsize=(14, 8))
for surface in (fig.patch, ax):
    surface.set_facecolor('white')

missing_values.plot.bar(color='skyblue', ax=ax)
ax.set_title('Count of Missing Values in Each Column')
ax.set_xlabel('Columns')
ax.set_ylabel('Number of Missing Values')
# Pin the tick positions before relabelling them with the (rotated) column names.
tick_positions = range(len(missing_values.index))
ax.set_xticks(tick_positions)
ax.set_xticklabels(missing_values.index, rotation=45)
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.savefig('./results/Missing Values Distribution.png', bbox_inches='tight', facecolor=fig.get_facecolor())
plt.show()
# Extract the day, month, and year from the release date, and plot each independently
# Parse 'Release Date' and derive released_day / released_month / released_year from it.
df['Release Date'] = pd.to_datetime(df['Release Date'])
release_dt = df['Release Date'].dt
# The same steps apply to each date part, so they run in one loop. Building the
# column name outside the f-string also avoids reusing the outer quote inside
# the braces, which only parses on Python 3.12+.
for part, values in (('day', release_dt.day),
                     ('month', release_dt.month),
                     ('year', release_dt.year)):
    column = f'released_{part}'
    df[column] = values
    print(f'head {part}: {df[column].head()}')

# Convert to pandas' nullable integer dtype so the parts are stored as integers
# while missing dates stay missing.
for column in ('released_day', 'released_month', 'released_year'):
    df[column] = df[column].astype('Int64')
print(df['released_day'].dtype)
head day: 0 12.0 1 8.0 2 28.0 3 1.0 4 22.0 Name: released_day, dtype: float64 head month: 0 12.0 1 5.0 2 8.0 3 10.0 4 9.0 Name: released_month, dtype: float64 head year: 0 2008.0 1 2020.0 2 2020.0 3 2016.0 4 2011.0 Name: released_year, dtype: float64 Int64
# Check how far back the data goes: list the distinct release years and their extremes.
# The column is bound to a name first so the f-strings need no nested quotes
# (reusing the outer quote inside the braces only parses on Python 3.12+).
release_years = df['released_year']
print(f'unique sorter years: \n {sorted(release_years.dropna().unique())}')
print(f'Minimum year: {release_years.min()}')
print(f'Maximum year: {release_years.max()}')
unique sorter years: [1910, 1913, 1915, 1916, 1918, 1920, 1921, 1923, 1924, 1925, 1927, 1928, 1929, 1930, 1931, 1932, 1934, 1936, 1937, 1938, 1939, 1940, 1941, 1942, 1943, 1946, 1947, 1948, 1949, 1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021] Minimum year: 1910 Maximum year: 2021
# Average IMDb score per release month, drawn as a labelled bar chart.
monthly_data = df.groupby('released_month')['IMDb Score'].mean().reset_index()
unique_scores = sorted(df['IMDb Score'].unique())

fig = px.bar(
    monthly_data,
    x='released_month',
    y='IMDb Score',
    text='IMDb Score',
    title='Average Rating by Released Month',
)
bar_style = {
    'textposition': 'outside',
    'texttemplate': '%{text:.2f}',
    'textfont_size': 16,
    'marker_color': '#636EFA',
}
fig.update_traces(**bar_style)

# NOTE(review): 'tickvals' is probably ignored while tickmode='linear' - confirm
# against the Plotly axis reference.
layout_options = {
    'xaxis_title': 'Released Month',
    'yaxis_title': 'Avg IMDb Score',
    'title_x': 0.5,
    'title_y': 0.85,
    'xaxis_title_font': {'size': 17},
    'yaxis_title_font': {'size': 17},
    'title_font': {'size': 22, 'family': 'Arial Black'},
    'width': 650,
    'height': 400,
    'xaxis': {'tickmode': 'linear', 'tickfont': {'size': 14}},
    'yaxis': {
        'tickmode': 'linear',
        'tickvals': unique_scores,
        # Span the whole observed score scale with half a point of padding.
        'range': [min(unique_scores) - 0.5, max(unique_scores) + 0.5],
        'tickfont': {'size': 14},
    },
    'template': 'plotly_white',
}
fig.update_layout(**layout_options)

# File name offered by the mode bar's image-download button.
config = {'toImageButtonOptions': {'filename': 'Rating and Month'}}
fig.show(config=config)
# Average IMDb score per release year, restricted to the 15 most recent years.
yearly_means = df.groupby('released_year')['IMDb Score'].mean().reset_index()
yearly_data = yearly_means.sort_values(by='released_year', ascending=False).head(15)
unique_scores = sorted(df['IMDb Score'].unique())

fig = px.bar(
    yearly_data,
    x='released_year',
    y='IMDb Score',
    text='IMDb Score',
    title='Average Rating by Released Year',
)
bar_style = {
    'textposition': 'outside',
    'texttemplate': '%{text:.2f}',
    'textfont_size': 16,
    'marker_color': '#636EFA',
}
fig.update_traces(**bar_style)

layout_options = {
    'xaxis_title': 'Released Year',
    'yaxis_title': 'Avg IMDb Score',
    'title_x': 0.5,
    'title_y': 0.85,
    'width': 650,
    'height': 400,
    'xaxis_title_font': {'size': 17},
    'yaxis_title_font': {'size': 17},
    'title_font': {'size': 22, 'family': 'Arial Black'},
    'xaxis': {'tickmode': 'linear', 'tickfont': {'size': 14}, 'tickangle': 45},
    'yaxis': {
        'tickmode': 'linear',
        'tickvals': unique_scores,
        # Span the whole observed score scale with half a point of padding.
        'range': [min(unique_scores) - 0.5, max(unique_scores) + 0.5],
        'tickfont': {'size': 14},
    },
    'template': 'plotly_white',
}
fig.update_layout(**layout_options)

# File name offered by the mode bar's image-download button.
config = {'toImageButtonOptions': {'filename': 'Rating and Year'}}
fig.show(config=config)
# Distinct release years (missing values excluded), oldest first.
known_years = df['released_year'].dropna()
sorted_unique_years = sorted(known_years.unique())
print(sorted_unique_years)
[1910, 1913, 1915, 1916, 1918, 1920, 1921, 1923, 1924, 1925, 1927, 1928, 1929, 1930, 1931, 1932, 1934, 1936, 1937, 1938, 1939, 1940, 1941, 1942, 1943, 1946, 1947, 1948, 1949, 1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021]
# Average IMDb score per day of the month on which the title was released.
daily_data = df.groupby('released_day')['IMDb Score'].mean().reset_index()
unique_scores = sorted(df['IMDb Score'].unique())

fig = px.bar(
    daily_data,
    x='released_day',
    y='IMDb Score',
    text='IMDb Score',
    title='Average Rating by Released Day',
)
bar_style = {
    'textposition': 'outside',
    'texttemplate': '%{text:.2f}',
    'textfont_size': 16,
    'marker_color': '#636EFA',
}
fig.update_traces(**bar_style)

layout_options = {
    'xaxis_title': 'Released Day',
    'yaxis_title': 'Avg IMDb Score',
    'title_x': 0.5,
    'xaxis_title_font': {'size': 17},
    'yaxis_title_font': {'size': 17},
    'title_font': {'size': 22},
    'xaxis': {'tickmode': 'linear', 'tickfont': {'size': 14}},
    'yaxis': {
        'tickmode': 'linear',
        'tickvals': unique_scores,
        # Span the whole observed score scale with half a point of padding.
        'range': [min(unique_scores) - 0.5, max(unique_scores) + 0.5],
        'tickfont': {'size': 14},
    },
    'template': 'plotly_white',
}
fig.update_layout(**layout_options)

# File name offered by the mode bar's image-download button.
config = {'toImageButtonOptions': {'filename': 'Rating and Day'}}
fig.show(config=config)
# Inspection only: a bare expression, so the column index is displayed only in an
# interactive/notebook session and nothing is assigned.
df.columns
Index(['Title', 'Genre', 'Tags', 'Languages', 'Series or Movie', 'Runtime',
'Director', 'Writer', 'Actors', 'Release Date', 'Summary', 'IMDb Score',
'released_day', 'released_month', 'released_year'],
dtype='object')
# Average IMDb score per runtime bucket.
# Drop rows where 'Runtime' or 'IMDb Score' is NaN
df_clean = df.dropna(subset=['Runtime', 'IMDb Score'])
# Group the cleaned frame (it was previously built and then left unused).
runtime_data = df_clean.groupby('Runtime')['IMDb Score'].mean().reset_index()

# Creating the bar plot
fig = px.bar(runtime_data, x='Runtime', y='IMDb Score', text='IMDb Score',
             title='Average Rating by Runtime')
fig.update_traces(
    textposition='outside',
    texttemplate='%{text:.2f}',
    textfont_size=16,
    marker_color='#636EFA',
)
unique_scores = sorted(df['IMDb Score'].unique())
fig.update_layout(
    xaxis_title='Runtime',
    yaxis_title='Avg IMDb Score',
    title_x=0.5,
    title_y=0.85,
    xaxis_title_font=dict(size=17),
    yaxis_title_font=dict(size=17),
    title_font=dict(size=22, family='Arial Black'),
    width=650,
    height=400,
    xaxis=dict(
        tickmode='linear',
        tickfont=dict(size=14),
        tickangle=45,  # Rotate the x-axis labels by 45 degrees
    ),
    yaxis=dict(
        tickmode='linear',
        tickvals=unique_scores,
        # Span the whole observed score scale with half a point of padding.
        range=[min(unique_scores) - 0.5, max(unique_scores) + 0.5],
        tickfont=dict(size=14),
    ),
    template='plotly_white',
)
# File name offered by the mode bar's image-download button.
config = {
    'toImageButtonOptions': {
        'filename': 'Rating and Runtime'
    }
}
fig.show(config=config)
# Average IMDb score for every (Runtime, Series or Movie) pair, as grouped bars.
runtime_series_data = (
    df.groupby(['Runtime', 'Series or Movie'])['IMDb Score']
    .mean()
    .reset_index()
)
unique_scores = sorted(df['IMDb Score'].unique())

fig = px.bar(
    runtime_series_data,
    x='Runtime',
    y='IMDb Score',
    color='Series or Movie',
    text='IMDb Score',
    title='Average Rating by Runtime and Series or Movie',
)
label_style = {
    'textposition': 'outside',
    'texttemplate': '%{text:.2f}',
    'textfont_size': 16,
}
fig.update_traces(**label_style)

layout_options = {
    'xaxis_title': 'Runtime',
    'yaxis_title': 'Avg IMDb Score',
    'title_x': 0.5,
    'title_y': 0.85,
    # Side-by-side bars rather than stacked ones.
    'barmode': 'group',
    'xaxis_title_font': {'size': 17},
    'yaxis_title_font': {'size': 17},
    'title_font': {'size': 22, 'family': 'Arial Black'},
    'width': 650,
    'height': 500,
    'xaxis': {'tickmode': 'linear', 'tickfont': {'size': 14}, 'tickangle': 45},
    'yaxis': {
        'tickmode': 'linear',
        'tickvals': unique_scores,
        'range': [min(unique_scores) - 0.5, max(unique_scores) + 0.5],
        'tickfont': {'size': 14},
    },
    'template': 'plotly_white',
}
fig.update_layout(**layout_options)

# File name offered by the mode bar's image-download button.
config = {'toImageButtonOptions': {'filename': 'Rating_Runtime_Series'}}
fig.show(config=config)
# Inspection only: a bare expression, so the column index is displayed only in an
# interactive/notebook session and nothing is assigned.
df.columns
Index(['Title', 'Genre', 'Tags', 'Languages', 'Series or Movie', 'Runtime',
'Director', 'Writer', 'Actors', 'Release Date', 'Summary', 'IMDb Score',
'released_day', 'released_month', 'released_year'],
dtype='object')
# Average IMDb score for each value of 'Series or Movie'.
film_type_mean_rating = df.groupby('Series or Movie')['IMDb Score'].mean().reset_index()
unique_scores = sorted(df['IMDb Score'].unique())

fig = px.bar(
    film_type_mean_rating,
    x='Series or Movie',
    y='IMDb Score',
    text='IMDb Score',
    title='Average Rating by Film Type',
)
bar_style = {
    'textposition': 'outside',
    'texttemplate': '%{text:.2f}',
    'textfont_size': 16,
    'marker_color': '#636EFA',
}
fig.update_traces(**bar_style)

layout_options = {
    'xaxis_title': 'Film Type',
    'yaxis_title': 'Avg IMDb Score',
    'title_x': 0.5,
    'title_y': 0.85,
    'width': 550,
    'xaxis_title_font': {'size': 17},
    'yaxis_title_font': {'size': 17},
    'title_font': {'size': 22, 'family': 'Arial Black'},
    'xaxis': {'tickmode': 'linear', 'tickfont': {'size': 14}},
    'yaxis': {
        'tickmode': 'linear',
        'tickvals': unique_scores,
        'range': [min(unique_scores) - 0.5, max(unique_scores) + 0.5],
        'tickfont': {'size': 14},
    },
    'template': 'plotly_white',
}
fig.update_layout(**layout_options)

# File name offered by the mode bar's image-download button.
config = {'toImageButtonOptions': {'filename': 'Rating and Film Type'}}
fig.show(config=config)
# Overlaid density curves of the IMDb score, one per value of 'Series or Movie'.
sns.set_theme(style="white")
kde_ax = sns.kdeplot(
    data=df,
    x="IMDb Score",
    hue="Series or Movie",
    fill=True,
    alpha=0.4,
    linewidth=1.5,
)
# Title and axis labels; the x label is deliberately blank.
kde_ax.set_title("Rating Distribution by Film Type")
kde_ax.set_xlabel("")
kde_ax.set_ylabel("Density")
plt.savefig("./results/Rating Distribution by Film Type.png", dpi=300)
plt.show()
# Box plot of the raw IMDb scores for each value of 'Series or Movie'.
fig = px.box(df, x="Series or Movie", y="IMDb Score",
             title="Box Plot of IMDb Ratings by Film's Type")
fig.update_layout(
    template='plotly_white',
    xaxis_title="Film's Type",
    # A box plot shows the distribution of individual scores, not an average,
    # so the axis is labelled with the plain score.
    yaxis_title='IMDb Score',
    title_x=0.5,
    title_y=0.85,
    width=550,
    xaxis_title_font=dict(size=17),
    yaxis_title_font=dict(size=17),
    title_font=dict(size=22),
    xaxis=dict(tickfont=dict(size=14)),
    yaxis=dict(tickfont=dict(size=14)),
)
# File name offered by the mode bar's image-download button.
config = {
    'toImageButtonOptions': {
        'filename': 'Box Plot - Rating and Film type'
    }
}
fig.show(config=config)
# Box plot of the raw IMDb scores for each 'Runtime' value.
fig = px.box(df, x="Runtime", y="IMDb Score",
             title="Box Plot of IMDb Ratings by Runtime")
fig.update_layout(
    template='plotly_white',
    xaxis_title="Film's Runtime",
    # A box plot shows the distribution of individual scores, not an average,
    # so the axis is labelled with the plain score.
    yaxis_title='IMDb Score',
    title_x=0.5,
    title_y=0.85,
    width=550,
    xaxis_title_font=dict(size=17),
    yaxis_title_font=dict(size=17),
    title_font=dict(size=22),
    xaxis=dict(tickfont=dict(size=14)),
    yaxis=dict(tickfont=dict(size=14)),
)
# File name offered by the mode bar's image-download button.
config = {
    'toImageButtonOptions': {
        'filename': 'Box Plot - Rating and Runtime'
    }
}
fig.show(config=config)
# Independent copy of the data restricted to rows that have a genre.
genre_data = df.dropna(subset=['Genre']).copy()
def process_col(col):
    """Normalize a comma-separated string.

    Each comma-separated item is lower-cased and stripped of surrounding
    whitespace, and the items are re-joined with ", ". Values that
    ``pd.isna`` reports as missing are returned unchanged.
    """
    if pd.isna(col):
        return col
    return ', '.join(item.lower().strip() for item in col.split(','))
# Normalize the genre strings, then compute how often each exact genre combination occurs.
genre_data['Genre'] = genre_data['Genre'].apply(process_col)
# calculate the percentage of each value
genre_frequency = genre_data['Genre'].value_counts() / len(genre_data) * 100
genre_frequency = genre_frequency.reset_index()
print(genre_frequency.columns)
genre_frequency.columns = ['Genre', 'Percentage']
print(genre_frequency.columns)
genre_percentage = genre_frequency.sort_values(by='Percentage', ascending=False)
# Take the top 20 from the explicitly sorted frame, so the selection does not
# depend on the incoming row order and matches the frame merged with the
# average ratings later on.
top_20_genres = genre_percentage.head(20)
Index(['Genre', 'count'], dtype='object') Index(['Genre', 'Percentage'], dtype='object')
# Bar chart of the 20 most frequent genre combinations (share of titles, in percent).
fig = px.bar(top_20_genres, x='Genre', y='Percentage', text='Percentage',
             title='20 Most Frequent Genre Combinations')
fig.update_traces(
    textposition='outside',
    texttemplate='%{text:.2f}%',
    textfont_size=16,
    marker_color='#636EFA',
)
# This axis shows percentages, so its range comes from the plotted data rather
# than from the IMDb score scale; 15% headroom leaves space for the labels
# drawn above the bars.
percentage_axis_max = top_20_genres['Percentage'].max() * 1.15
fig.update_layout(
    xaxis_title='Genres',
    yaxis_title='Percentage',
    title_x=0.5,
    title_y=0.85,
    xaxis_title_font=dict(size=17),
    yaxis_title_font=dict(size=17),
    title_font=dict(size=22),
    xaxis=dict(tickmode='linear',
               tickfont=dict(size=14)),
    yaxis=dict(range=[0, percentage_axis_max],
               tickfont=dict(size=14)),
    template='plotly_white',
)
# File name offered by the mode bar's image-download button.
config = {
    'toImageButtonOptions': {
        'filename': '20 Most frequent Genre Combinations'
    }
}
fig.show(config=config)
# Average IMDb score per genre combination, joined onto the frequency table.
genre_mean_imdb = genre_data.groupby('Genre')['IMDb Score'].mean().reset_index()
print(genre_mean_imdb.columns)
genre_mean_imdb = genre_mean_imdb.rename(columns={'IMDb Score': 'Avg IMDb Score'})
genre_stats = genre_percentage.merge(genre_mean_imdb, on='Genre')
# Despite its name, this holds the first 20 rows of the merged table.
top_30_genres = genre_stats.head(20)
Index(['Genre', 'IMDb Score'], dtype='object')
# Average IMDb score of the 20 most frequent genre combinations.
fig = px.bar(top_30_genres, x='Genre', y='Avg IMDb Score', text='Avg IMDb Score',
             title='20 Most Frequent Genre Combinations and Avg Rating')
fig.update_traces(
    textposition='outside',
    texttemplate='%{text:.2f}',
    textfont_size=16,
    marker_color='#636EFA',
)
fig.update_layout(
    xaxis_title='Genres',
    yaxis_title='Avg IMDb Score',
    title_x=0.5,
    title_y=0.85,
    width=1000,
    xaxis_title_font=dict(size=17),
    yaxis_title_font=dict(size=17),
    title_font=dict(size=22, family='Arial Black'),
    xaxis=dict(tickmode='linear',
               tickfont=dict(size=14)),
    yaxis=dict(tickmode='linear',
               tickvals=unique_scores,
               # Span the whole observed score scale with half a point of padding.
               range=[min(unique_scores) - 0.5, max(unique_scores) + 0.5],
               tickfont=dict(size=14)),
    template='plotly_white',
)
# Distinct export name, so a downloaded image is not confused with the
# frequency chart of the same genre combinations.
config = {
    'toImageButtonOptions': {
        'filename': '20 Most frequent Genre Combinations and Avg Rating'
    }
}
fig.show(config=config)
# Start again from the full data: keep rows that have a genre and normalize the genre strings.
genre_data = df.dropna(subset=['Genre']).copy()
normalized_genres = genre_data['Genre'].apply(process_col)
genre_data['Genre'] = normalized_genres
def save_first_val(col):
    """Return the text before the first comma of ``col`` (all of ``col`` if it has no comma)."""
    first, _, _ = col.partition(',')
    return first
# Keep only the first listed genre of every title, then compute each genre's share in percent.
first_genres = genre_data['Genre'].apply(save_first_val)
genre_data['Genre'] = first_genres
genre_counts = genre_data['Genre'].value_counts()
genre_frequency = (genre_counts / len(genre_data) * 100).reset_index()
print(genre_frequency.columns)
genre_frequency.columns = ['Genre', 'Percentage']
print(genre_frequency.columns)
genre_percentage = genre_frequency.sort_values(by='Percentage', ascending=False)
Index(['Genre', 'count'], dtype='object') Index(['Genre', 'Percentage'], dtype='object')
# Share of titles (in percent) for each first-listed genre.
fig = px.histogram(genre_percentage, x='Genre', y='Percentage', text_auto=True,
                   title='Most Frequent Genre Combinations')
fig.update_traces(
    texttemplate='%{y:.2f}%',
    textfont=dict(size=50),
    textposition='outside',
    marker_color='#636EFA',
    insidetextfont=dict(size=30),
    outsidetextfont=dict(size=30),
)
fig.update_layout(
    xaxis_title='Genres',
    yaxis_title='Percentage',
    title_x=0.5,
    xaxis_title_font=dict(size=17),
    yaxis_title_font=dict(size=17),
    title_font=dict(size=22),
    xaxis=dict(
        tickmode='linear',
        tickfont=dict(size=14),
    ),
    yaxis=dict(
        # ".2f%%" is not a valid d3-format specifier; format the number with
        # ".2f" and add the percent sign as a tick suffix instead.
        tickformat='.2f',
        ticksuffix='%',
        tickfont=dict(size=14),
    ),
    template='plotly_white',
)
# File name offered by the mode bar's image-download button.
config = {
    'toImageButtonOptions': {
        'filename': 'Most frequent Genre Combinations'
    }
}
fig.show(config=config)
# Bar chart of how often each first-listed genre occurs (values are percentages).
fig = px.bar(genre_percentage, x='Genre', y='Percentage', text='Percentage',
             title='Genre frequencies (showed as percentage)')
fig.update_traces(
    textposition='outside',
    texttemplate='%{text:.2f}',
    textfont_size=30,
    marker_color='#636EFA',
)
fig.update_layout(
    height=600,
    xaxis_title='Genres',
    yaxis_title='Percentage',
    title_x=0.5,
    xaxis_title_font=dict(size=17),
    yaxis_title_font=dict(size=17),
    title_font=dict(size=22),
    xaxis=dict(tickmode='linear',
               tickfont=dict(size=14)),
    yaxis=dict(tickmode='linear',
               tickfont=dict(size=14)),
    template='plotly_white',
)
# Export name matches this chart; it previously reused the name of the
# "20 most frequent genre combinations" chart.
config = {
    'toImageButtonOptions': {
        'filename': 'Genre frequencies'
    }
}
fig.show(config=config)
# Average IMDb score per first-listed genre, joined onto the frequency table.
genre_score_means = genre_data.groupby('Genre')['IMDb Score'].mean()
genre_mean_imdb = genre_score_means.reset_index()
print(genre_mean_imdb.columns)
genre_mean_imdb = genre_mean_imdb.rename(columns={'IMDb Score': 'Avg IMDb Score'})
genre_stats = genre_percentage.merge(genre_mean_imdb, on='Genre')
Index(['Genre', 'IMDb Score'], dtype='object')
# Average IMDb score for every genre in genre_stats.
fig = px.bar(genre_stats, x='Genre', y='Avg IMDb Score', text='Avg IMDb Score',
             title='Avg Rating by Genre')
fig.update_traces(
    textposition='outside',
    # Ratings are scores, not percentages, so the labels carry no '%' suffix.
    texttemplate='%{text:.2f}',
    textfont_size=16,
    marker_color='#636EFA',
)
fig.update_layout(
    xaxis_title='Genres',
    yaxis_title='Avg IMDb Score',
    title_x=0.5,
    title_y=0.85,
    xaxis_title_font=dict(size=17),
    yaxis_title_font=dict(size=17),
    title_font=dict(size=22, family='Arial Black'),
    xaxis=dict(
        tickmode='linear',
        tickfont=dict(size=14),
    ),
    yaxis=dict(
        tickmode='linear',
        tickvals=unique_scores,
        # Span the whole observed score scale with half a point of padding.
        range=[min(unique_scores) - 0.5, max(unique_scores) + 0.5],
        tickfont=dict(size=14),
    ),
    template='plotly_white',
)
# Export name matches this chart's title instead of the genre-frequency chart.
config = {
    'toImageButtonOptions': {
        'filename': 'Avg Rating by Genre'
    }
}
fig.show(config=config)
# Independent copy of the data restricted to rows that name a director.
director_data = df.dropna(subset=['Director']).copy()
print(director_data['Director'].head(15))
0 Tomas Alfredson 1 Coky Giedroyc 2 Brendan Walsh 4 Stephen Irwin 5 Mez Tharatorn 8 Alf Sjöberg 9 Lasse Åberg 10 Jon Holmberg 11 David S. Goyer 12 Hans Alfredson 13 Lasse Åberg 14 José Esteban Alenda, César Esteban Alenda 15 Todd Phillips 16 George Lucas 17 David Yates Name: Director, dtype: object
def pre_process_category(value):
    """Return the first comma-separated entry of ``value``, stripped of surrounding whitespace.

    Non-string inputs (for example NaN in a missing cell) are returned
    unchanged. Letter case is preserved: a bare ``value.lower()`` call would
    discard its result, so no such call is made here.
    """
    if not isinstance(value, str):
        return value
    return value.split(',')[0].strip()
# Keep only the first credited director of every title.
first_directors = director_data['Director'].apply(pre_process_category)
director_data['Director'] = first_directors
print(director_data['Director'].head(15))
0 Tomas Alfredson 1 Coky Giedroyc 2 Brendan Walsh 4 Stephen Irwin 5 Mez Tharatorn 8 Alf Sjöberg 9 Lasse Åberg 10 Jon Holmberg 11 David S. Goyer 12 Hans Alfredson 13 Lasse Åberg 14 José Esteban Alenda 15 Todd Phillips 16 George Lucas 17 David Yates Name: Director, dtype: object
# Share of titles (in percent) per first-credited director.
director_counts = director_data['Director'].value_counts()
director_data_freq = (director_counts / len(director_data) * 100).reset_index()
print(director_data_freq.columns)
director_data_freq.columns = ['Director', 'Percentage']
print(director_data_freq.columns)
director_data_percentage = director_data_freq.sort_values(by='Percentage', ascending=False)
# Keep the 50 directors with the largest share.
top_50_directors = director_data_percentage.head(50)
Index(['Director', 'count'], dtype='object') Index(['Director', 'Percentage'], dtype='object')
# Share of titles (in percent) for the 50 most frequent first-credited directors.
fig = px.histogram(top_50_directors, x='Director', y='Percentage', text_auto=True,
                   title='Most Frequent Directors Combinations')
fig.update_traces(
    texttemplate='%{y:.2f}%',
    textfont=dict(size=50),
    textposition='outside',
    marker_color='#636EFA',
    insidetextfont=dict(size=30),
    outsidetextfont=dict(size=30),
)
fig.update_layout(
    xaxis_title='Director',
    yaxis_title='Percentage',
    title_x=0.5,
    xaxis_title_font=dict(size=17),
    yaxis_title_font=dict(size=17),
    title_font=dict(size=22),
    xaxis=dict(
        tickmode='linear',
        tickfont=dict(size=14),
    ),
    yaxis=dict(
        # ".2f%%" is not a valid d3-format specifier; format the number with
        # ".2f" and add the percent sign as a tick suffix instead.
        tickformat='.2f',
        ticksuffix='%',
        tickfont=dict(size=14),
    ),
    template='plotly_white',
)
# Export name refers to directors; it previously reused the genre chart's name.
config = {
    'toImageButtonOptions': {
        'filename': 'Most frequent Directors'
    }
}
fig.show(config=config)
# Average IMDb score per director, joined onto the 50 most frequent directors.
director_score_means = director_data.groupby('Director')['IMDb Score'].mean()
director_mean_imdb = director_score_means.reset_index()
print(director_mean_imdb.columns)
director_mean_imdb = director_mean_imdb.rename(columns={'IMDb Score': 'Avg IMDb Score'})
director_stats = top_50_directors.merge(director_mean_imdb, on='Director')
Index(['Director', 'IMDb Score'], dtype='object')
# Average IMDb score of the 50 most frequent first-credited directors.
fig = px.bar(director_stats, x='Director', y='Avg IMDb Score', text='Avg IMDb Score',
             title='Avg Rating by Director')
fig.update_traces(
    textposition='outside',
    # Ratings are scores, not percentages, so the labels carry no '%' suffix.
    texttemplate='%{text:.2f}',
    textfont_size=16,
    marker_color='#636EFA',
)
fig.update_layout(
    xaxis_title='Director',
    yaxis_title='Avg IMDb Score',
    title_x=0.5,
    title_y=0.85,
    width=650,
    xaxis_title_font=dict(size=17),
    yaxis_title_font=dict(size=17),
    title_font=dict(size=22, family='Arial Black'),
    xaxis=dict(
        tickmode='linear',
        tickfont=dict(size=14),
    ),
    yaxis=dict(
        tickmode='linear',
        tickvals=unique_scores,
        # Span the whole observed score scale with half a point of padding.
        range=[min(unique_scores) - 0.5, max(unique_scores) + 0.5],
        tickfont=dict(size=14),
    ),
    template='plotly_white',
)
# Export name matches this chart's title instead of the genre-frequency chart.
config = {
    'toImageButtonOptions': {
        'filename': 'Avg Rating by Director'
    }
}
fig.show(config=config)
# Independent copy restricted to rows that name a writer; keep only the first credited writer.
writer_data = df.dropna(subset=['Writer']).copy()
print(writer_data['Writer'].head(15))
first_writers = writer_data['Writer'].apply(pre_process_category)
writer_data['Writer'] = first_writers
print(writer_data['Writer'].head(15))
0 John Ajvide Lindqvist 1 Caitlin Moran 2 Brendan Walsh, Daley Nixon 5 Pattaranad Bhiboonsawade, Thodsapon Thiptinnak... 8 Ivar Lo-Johansson 9 Lasse Åberg, Bo Jonsson 10 Jon Holmberg, Daniella Mendel-Enk, Sara Young,... 11 Christine Roum, Mats Wahl, Mick Davis 12 Hans Alfredson 13 Lasse Åberg 14 José Esteban Alenda, Victoria Ruiz, César Este... 15 Scott Silver, Jerry Robinson, Todd Phillips, B... 16 George Lucas 17 Steve Kloves, J.K. Rowling 19 Hans Alfredson Name: Writer, dtype: object 0 John Ajvide Lindqvist 1 Caitlin Moran 2 Brendan Walsh 5 Pattaranad Bhiboonsawade 8 Ivar Lo-Johansson 9 Lasse Åberg 10 Jon Holmberg 11 Christine Roum 12 Hans Alfredson 13 Lasse Åberg 14 José Esteban Alenda 15 Scott Silver 16 George Lucas 17 Steve Kloves 19 Hans Alfredson Name: Writer, dtype: object
# Share of titles (in percent) per first-credited writer.
# The denominator is the number of rows in writer_data itself; dividing by the
# size of director_data would distort the percentages whenever the two frames
# differ in length.
writer_data_freq = writer_data['Writer'].value_counts() / len(writer_data) * 100
writer_data_freq = writer_data_freq.reset_index()
print(writer_data_freq.columns)
writer_data_freq.columns = ['Writer', 'Percentage']
print(writer_data_freq.columns)
writer_data_percentage = writer_data_freq.sort_values(by='Percentage', ascending=False)
# Keep the 50 writers with the largest share.
top_50_writers = writer_data_percentage.head(50)

# Average IMDb score per writer, joined onto the 50 most frequent writers.
writer_mean_imdb = writer_data.groupby('Writer')['IMDb Score'].mean().reset_index()
print(writer_mean_imdb.columns)
writer_mean_imdb.columns = ['Writer', 'Avg IMDb Score']
writer_stats = pd.merge(top_50_writers, writer_mean_imdb, on='Writer')
Index(['Writer', 'count'], dtype='object') Index(['Writer', 'Percentage'], dtype='object') Index(['Writer', 'IMDb Score'], dtype='object')
# Average IMDb score of the 50 most frequent first-credited writers.
fig = px.bar(writer_stats, x='Writer', y='Avg IMDb Score', text='Avg IMDb Score',
             title='Avg Rating by Writer')
fig.update_traces(
    textposition='outside',
    # Ratings are scores, not percentages, so the labels carry no '%' suffix.
    texttemplate='%{text:.2f}',
    textfont_size=16,
    marker_color='#636EFA',
)
fig.update_layout(
    xaxis_title='Writer',
    yaxis_title='Avg IMDb Score',
    title_x=0.5,
    title_y=0.85,
    width=650,
    xaxis_title_font=dict(size=17),
    yaxis_title_font=dict(size=17),
    title_font=dict(size=22, family='Arial Black'),
    xaxis=dict(
        tickmode='linear',
        tickfont=dict(size=14),
    ),
    yaxis=dict(
        tickmode='linear',
        tickvals=unique_scores,
        # Span the whole observed score scale with half a point of padding.
        range=[min(unique_scores) - 0.5, max(unique_scores) + 0.5],
        tickfont=dict(size=14),
    ),
    template='plotly_white',
)
# Export name matches this chart's title instead of the genre-frequency chart.
config = {
    'toImageButtonOptions': {
        'filename': 'Avg Rating by Writer'
    }
}
fig.show(config=config)
# Inspection only: a bare expression, so the column index is displayed only in an
# interactive/notebook session and nothing is assigned.
df.columns
Index(['Title', 'Genre', 'Tags', 'Languages', 'Series or Movie', 'Runtime',
'Director', 'Writer', 'Actors', 'Release Date', 'Summary', 'IMDb Score',
'released_day', 'released_month', 'released_year'],
dtype='object')
# Independent copy restricted to rows that list actors; keep only the first listed actor.
actor_data = df.dropna(subset=['Actors']).copy()
print(actor_data['Actors'].head(15))
first_actors = actor_data['Actors'].apply(pre_process_category)
actor_data['Actors'] = first_actors
print(actor_data['Actors'].head(15))
0 Kåre Hedebrant, Per Ragnar, Lina Leandersson, ... 1 Paddy Considine, Cleo, Beanie Feldstein, Dónal... 2 Genesis Rodriguez, Vincent Piazza 3 Vahide Perçin, Gonca Vuslateri, Cansu Dere, Be... 4 Ragga Gudrun 5 Thiti Mahayotaruk, Nadech Kugimiya, Kathaleeya... 6 Marcin Dorocinski, Piotr Nowak, Julia Kijowska... 7 Pawel Królikowski, Szymon Bobrowski, Danuta St... 8 Ulf Palme, Ragnar Falck, Hugo Björne, Eva Dahl... 9 Jon Skolmen, Cecilia Walton, Lasse Åberg, Eva ... 10 Elis Gerdt, Tea Stjärne, Fredrik Hallgren, Bax... 11 Marcia Gay Harden, Chris Marquette, Margarita ... 12 Stellan Skarsgård, Hans Alfredson, Per Myrberg... 13 Jon Skolmen, Ida Högberg, Lasse Åberg, Tobias ... 14 Manolo Solo, Roger Príncep, Cristina Marcos, R... Name: Actors, dtype: object 0 Kåre Hedebrant 1 Paddy Considine 2 Genesis Rodriguez 3 Vahide Perçin 4 Ragga Gudrun 5 Thiti Mahayotaruk 6 Marcin Dorocinski 7 Pawel Królikowski 8 Ulf Palme 9 Jon Skolmen 10 Elis Gerdt 11 Marcia Gay Harden 12 Stellan Skarsgård 13 Jon Skolmen 14 Manolo Solo Name: Actors, dtype: object
# Share of titles (in percent) per first-listed actor.
# The denominator is the number of rows in actor_data itself; dividing by the
# size of director_data would distort the percentages whenever the two frames
# differ in length.
actor_data_freq = actor_data['Actors'].value_counts() / len(actor_data) * 100
actor_data_freq = actor_data_freq.reset_index()
print(actor_data_freq.columns)
actor_data_freq.columns = ['Actors', 'Percentage']
print(actor_data_freq.columns)
actor_data_percentage = actor_data_freq.sort_values(by='Percentage', ascending=False)
# Keep the 50 actors with the largest share.
top_50_actors = actor_data_percentage.head(50)

# Average IMDb score per actor, joined onto the 50 most frequent actors.
actor_mean_imdb = actor_data.groupby('Actors')['IMDb Score'].mean().reset_index()
print(actor_mean_imdb.columns)
actor_mean_imdb.columns = ['Actors', 'Avg IMDb Score']
actor_stats = pd.merge(top_50_actors, actor_mean_imdb, on='Actors')
Index(['Actors', 'count'], dtype='object') Index(['Actors', 'Percentage'], dtype='object') Index(['Actors', 'IMDb Score'], dtype='object')
# Average IMDb score of the 50 most frequent first-listed actors.
fig = px.bar(actor_stats, x='Actors', y='Avg IMDb Score', text='Avg IMDb Score',
             title='Avg Rating by Actor')
fig.update_traces(
    textposition='outside',
    # Ratings are scores, not percentages, so the labels carry no '%' suffix.
    texttemplate='%{text:.2f}',
    textfont_size=16,
    marker_color='#636EFA',
)
fig.update_layout(
    xaxis_title='Actors',
    yaxis_title='Avg IMDb Score',
    title_x=0.5,
    title_y=0.85,
    width=650,
    xaxis_title_font=dict(size=17),
    yaxis_title_font=dict(size=17),
    title_font=dict(size=22, family='Arial Black'),
    xaxis=dict(
        tickmode='linear',
        tickfont=dict(size=14),
    ),
    yaxis=dict(
        tickmode='linear',
        tickvals=unique_scores,
        # Span the whole observed score scale with half a point of padding.
        range=[min(unique_scores) - 0.5, max(unique_scores) + 0.5],
        tickfont=dict(size=14),
    ),
    template='plotly_white',
)
# Export name matches this chart's title instead of the genre-frequency chart.
config = {
    'toImageButtonOptions': {
        'filename': 'Avg Rating by Actor'
    }
}
fig.show(config=config)